#----------------------------------------------------------#
# Pseudo id - Managing parquet files #
#----------------------------------------------------------#

  # Run this file before S2_pseudo_id.sas: it imports the 2021 to 2023 parquet files
  # and saves them as .dta files (folder parquet_dta) for further chaining.

#----------------------------------------------------------#
# Settings ####
#----------------------------------------------------------#
  
  # Packages & libraries: install whatever is missing, then attach.
  mypackages <- c("data.table", "haven", "arrow")
  toinstall <- setdiff(mypackages, rownames(installed.packages()))
  if (length(toinstall) > 0) install.packages(toinstall)

  library("data.table")
  library("haven")
  library("arrow")

  # Working directory: move to the folder of this script when it is run from a
  # saved document in RStudio. Outside RStudio (or for an unsaved document) the
  # rstudioapi call cannot succeed, so the current working directory is kept
  # and reported instead of stopping the script.
  if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
    script_path <- rstudioapi::getActiveDocumentContext()$path
    if (nzchar(script_path)) {
      setwd(dirname(script_path))
    } else {
      message("Active document has no path; keeping working directory: ", getwd())
    }
  } else {
    message("RStudio is not available; keeping working directory: ", getwd())
  }

  # Create the parquet_dta folder that receives the temporary .dta files
  if (!dir.exists("parquet_dta")) {
    dir.create("parquet_dta")
  }

  # Current parquet folders (one variable lib<year> per year file; the import
  # function looks these up by name, and the trailing separator is required
  # because file names are appended directly).
  lib2021 <- "\\\\casd.fr\\casdfs\\Projets\\INEPROG\\Data\\DADS_DADS Postes_2021\\Format parquet\\"
  lib2022 <- "\\\\casd.fr\\casdfs\\Projets\\INEPROG\\Data\\DADS_DADS Postes_2022\\"
  lib2023 <- "\\\\casd.fr\\casdfs\\Projets\\INEPROG\\Data\\DADS_DADS Postes_2023\\"
  
  # Future parquet folders
  # lib2024 <- "\\\\casd.fr\\casdfs\\Projets\\INEPROG\\Data\\DADS_DADS Postes_2024\\"
  # lib2025 <- "\\\\casd.fr\\casdfs\\Projets\\INEPROG\\Data\\DADS_DADS Postes_2025\\"
  # lib2026 <- "\\\\casd.fr\\casdfs\\Projets\\INEPROG\\Data\\DADS_DADS Postes_2026\\"
  
#----------------------------------------------------------#
# Import function: current-year (t) data of year file y-1 and previous-year (t-1) data of year file y ####
#----------------------------------------------------------#
    # Converts the parquet files needed to chain year y-1 with year y into two
    # Stata files written in the parquet_dta folder:
    #   * parquet_dta/dads_<y-1>.dta : "post_<y-1>" files found in the folder of
    #     year file y-1, original column names;
    #   * parquet_dta/dads_<y>_1.dta : "post_<y-1>" files found in the folder of
    #     year file y, every column except ident_s suffixed with "_1".
    #
    # Arguments:
    #   y  numeric scalar, the most recent year file (e.g. 2022). Variables named
    #      lib<y-1> and lib<y> holding the folder paths (with trailing separator)
    #      must exist in the calling environment chain.
    #      The former default `y = y` referred to itself and could never be
    #      evaluated, so the argument is now simply required.
    #
    # Returns NULL invisibly; called for its side effects (files written and
    # progress printed on the console).
    parquet_import <- function(y) {

      stopifnot(is.numeric(y), length(y) == 1L)
      y_1 <- y - 1

      # Reads every "post_<y-1>" parquet file of folder `lib`, filters the rows,
      # optionally suffixes the columns, and returns a single sorted data.table.
      read_posts <- function(lib, suffix = "") {

        files <- list.files(path = lib)
        files <- files[grep(paste0("post_", y_1), files)]
        print(lib)

        # 1:length(files) would iterate over c(1, 0) here and fail with an
        # obscure read error, so stop with an explicit message instead.
        if (length(files) == 0) {
          stop("No parquet file matching 'post_", y_1, "' found in ", lib,
               call. = FALSE)
        }

        pieces <- lapply(files, function(f) {
          dt <- read_parquet(paste0(lib, f),
                             col_select = c(sexe, siren_empl, nic_empl, nbheur, datdeb, datfin,
                                            duree, comr, comt, sonde, regt, s_brut,
                                            ident_s, annee_naiss, pps, dept))
          # read_parquet() returns a data.table only if the file metadata says
          # so; convert explicitly because the syntax below is data.table's.
          dt <- as.data.table(dt)

          # Keep rows with positive gross wage, hours and birth year, and drop
          # the placeholder employer identifier. %in% never returns NA, so rows
          # with a missing siren_empl are kept, as before.
          dt <- dt[s_brut > 0 & nbheur > 0 & annee_naiss > 0 &
                     !(siren_empl %in% "000000000")]
          dt[, s_brut := round(s_brut, 0)]

          if (nzchar(suffix)) {
            # Suffix every column but the identifier and keep the identifier as
            # the last column, which is the layout of the former output file.
            to_rename <- setdiff(names(dt), "ident_s")
            setnames(dt, to_rename, paste0(to_rename, suffix))
            setcolorder(dt, c(paste0(to_rename, suffix), "ident_s"))
          }

          print(f)
          print(nrow(dt))
          dt
        })

        # Bind once instead of growing the table file after file.
        stacked <- rbindlist(pieces, fill = TRUE)
        rm(pieces)

        # Sort in place by region then identifier; na.last = TRUE reproduces the
        # placement of missing values obtained with DT[order(...)].
        setorderv(stacked, c(paste0("regt", suffix), "ident_s"), na.last = TRUE)
        stacked
      }

      # Year t of year file y-1
      dads_y_1t <- read_posts(get(paste0("lib", y_1)))
      write_dta(dads_y_1t, paste0("parquet_dta/dads_", y_1, ".dta"), version = 14)
      print(nrow(dads_y_1t))
      str(dads_y_1t)
      # Free the memory before loading the second set of files.
      rm(dads_y_1t)

      # Year t-1 of year file y
      dads_yt_1 <- read_posts(get(paste0("lib", y)), suffix = "_1")
      write_dta(dads_yt_1, paste0("parquet_dta/dads_", y, "_1.dta"), version = 14)
      print(nrow(dads_yt_1))
      str(dads_yt_1)
      rm(dads_yt_1)

      invisible(NULL)
    }

#----------------------------------------------------------#  
# Run the function ####
#----------------------------------------------------------#  
  
  # Convert each pair of consecutive year files, oldest first.
  invisible(lapply(c(2022, 2023), function(year_file) parquet_import(y = year_file)))
  
  # parquet_import(y=2024)
  # parquet_import(y=2025)
  # parquet_import(y=2026)
  